## Point the session at the project folder; the relative path used when loading
## the corpus ("CSRs - Rec only") is resolved against this directory.
## NOTE(review): the absolute path is machine-specific -- adjust before running elsewhere.
setwd("C:/Users/reino/Dropbox/[Dissertation] Politicization and Economic Governance/Quant Paper 1 - CSRs")
## Echo the working directory as a sanity check
getwd()

library(SnowballC)
library(tm)
library(quanteda)
library(topicmodels)

## Loading in data: the target folder holds the recommendations as txt files.
## recursive = TRUE makes DirSource descend into every nested directory,
## so all files are read in a single call.
csr_source <- DirSource("CSRs - Rec only", recursive = TRUE, encoding = "UTF-8")
files <- Corpus(csr_source, readerControl = list(language = "en"))

## Working copy of the corpus plus a first document-term matrix;
## the row names of wcm are used below to derive the metadata.
texts <- files
wcm <- DocumentTermMatrix(texts)

## Creating metadata - Obtaining document information -
## notice the format of the filenames: csr[year][ms].txt
head(rownames(wcm), 10)
data <- data.frame(rownames = rownames(wcm), stringsAsFactors = FALSE)

## Strip the "csr" marker and the ".txt" extension from every file name.
## fixed = TRUE treats both as literal text; as a regex, the "." in ".txt"
## would match any character.
year_ms <- gsub("csr", "", data$rownames, fixed = TRUE)
year_ms <- gsub(".txt", "", year_ms, fixed = TRUE)

## What remains is [year][ms]: dropping the letters leaves the year,
## dropping the digits leaves the member state.
data$year <- gsub("[[:alpha:]]+", "", year_ms)
data$ms <- gsub("[[:digit:]]+", "", year_ms)

library(plyr)

## Lookup table: member state as spelled in the file names -> short code
ms_codes <- c(
  france        = "FRA",
  belgium       = "BEL",
  netherlands   = "NL",
  italy         = "ITA",
  luxembourg    = "LUX",
  denmark       = "DEN",
  ireland       = "IRE",
  spain         = "SPA",
  portugal      = "POR",
  finland       = "FIN",
  sweden        = "SWE",
  austria       = "AUS",
  cyprus        = "CYP",
  czechrepublic = "CZE",
  estonia       = "EST",
  hungary       = "HUN",
  latvia        = "LAT",
  lithuania     = "LIT",
  malta         = "MAL",
  poland        = "POL",
  slovakia      = "SLK",
  slovenia      = "SLV",
  bulgaria      = "BUL",
  romania       = "ROM",
  croatia       = "CRO",
  germany       = "GER",
  uk            = "UK"
)
data$ms <- revalue(data$ms, replace = ms_codes)

## Check which codes are present after recoding
unique(data$ms)

## Document identifier: member-state code immediately followed by the year
data$ID <- paste0(data$ms, data$year)

## Code below breaks up country-year files to individual recommendations to be
## able to measure socialness per recommendation
files <- tm_map(files, stripWhitespace)

## A recommendation starts at a one- or two-digit enumerator surrounded by
## spaces, written either as "(1)" or as "1."
rec_delimiter <- " \\(\\d\\) | \\d\\. | \\(\\d{2}\\) | \\d{2}\\. "

## One list element per country-year file, holding the pieces of that file
splitfiles <- vector("list", length(files))
for (i in seq_along(files)) {
  splitfiles[i] <- as.list(strsplit(files[i]$content, rec_delimiter))
}
splitfiles[[2]][2]

## Drop the first piece of every file, i.e. the text before the first enumerator
splitfiles <- lapply(splitfiles, function(pieces) pieces[-1])

## Name each file by its ID and each recommendation as [ID]_[running number].
## seq_along() with sprintf() yields zero names for a file with no recommendations
## left, where 1:length() would produce c(1, 0) and make the assignment fail.
names(splitfiles) <- data$ID
for (i in seq_along(splitfiles)) {
  names(splitfiles[[i]]) <- sprintf(
    "%s_%d", names(splitfiles)[i], seq_along(splitfiles[[i]])
  )
}
splitfiles[[174]][1]

## Folder that receives one file per individual recommendation
mainDir <- "C:/Users/reino/Dropbox/[Dissertation] Politicization and Economic Governance/Quant Paper 1 - CSRs"
subDir <- "Split recs"

## The directory is expected to exist already; uncomment on a first run.
#dir.create(file.path(mainDir, subDir))
## From here on the working directory is the "Split recs" sub-folder
setwd(file.path(mainDir, subDir))
## Create one file per recommendation, named after the recommendation's ID.
## NOTE(review): only file.create() is called -- nothing in this block writes the
## recommendation text into these files; confirm that is intended.
for (i in seq_along(splitfiles)) {
  file.create(names(splitfiles[[i]]))
}

## NOTE(review): filenames and splitrecs are not referenced again in the visible
## part of the script.
filenames <- list.files()
splitrecs <- 1:length(filenames)
## Turn every country-year element into a data frame with the recommendation
## text in the first column and an ID column taken from the row names.
## NOTE(review): the row names presumably carry over the element names assigned
## earlier, and the text column is presumably named after the expression passed to
## as.data.frame() -- confirm before changing the loop variable or call.
for (i in seq_along(splitfiles)) {
  splitfiles[[i]] <- as.data.frame(splitfiles[[i]])
  splitfiles[[i]]$ID <- rownames(splitfiles[[i]])
}

library(reshape)

## Combine the per-file data frames into a single table
df <- merge_recurse(splitfiles)

## Member state: the ID with all digits and the underscore removed
df$ms <- gsub("\\_", "", gsub("\\d", "", df$ID))

## Year: cut off the trailing "_[number]", then keep only the digits
df$year <- gsub("\\D", "", gsub("\\_\\d*$", "", df$ID))

## The first column holds the recommendation text
names(df)[1] <- "text"

## Rebuild the corpus with one document per individual recommendation
rec_source <- VectorSource(df$text)
texts <- Corpus(rec_source)
inspect(texts)
DocumentTermMatrix(texts)

## Document cleaning part I
# Collapse runs of whitespace
texts <- tm_map(texts, stripWhitespace)

#Removing punctuation
## Replace every character that is neither alphanumeric nor whitespace with a
## single space.
##   x: character vector
## Returns a character vector of the same length as x.
## The character classes sit in one negated bracket expression; a second "^"
## inside the brackets would be a literal caret and exempt "^" from removal.
removeNonAlnum <- function(x) {
  gsub("[^[:alnum:][:space:]]", " ", x)
}
# Apply the punctuation filter. It is a plain character function, so it is
# wrapped in content_transformer() in the same way as tolower below.
texts <- tm_map(texts, content_transformer(removeNonAlnum))

#Converting to Lower Case
texts <- tm_map(texts, content_transformer(tolower))

#Removing numbers
texts <- tm_map(texts, removeNumbers)

#Removing whitespaces again
texts <- tm_map(texts, stripWhitespace)

#Generate Word Count Matrix
wcm <- DocumentTermMatrix(texts)

# Look at the first cleaned document
inspect(texts[1])

### Document Cleaning PART II - Deleting stopwords, URLs, infrequent terms
#Removing URLs: every term that starts (^) with "www" or "http"
currentListOfWords <- colnames(wcm)
urls <- c(
  grep("^www", currentListOfWords, value = TRUE),
  grep("^http", currentListOfWords, value = TRUE)
)
texts <- tm_map(texts, removeWords, urls)

#Removing stop words
## Known limitation: the stopword list contains punctuation, so entries such as
## "hasn't" are not deleted from the already punctuation-free texts. Running
## this step earlier has its own drawback, because some words would still be
## capitalized. The trade-off is accepted.
english_stopwords <- stopwords("english")
texts <- tm_map(texts, removeWords, english_stopwords)

#Removing names
## data$ms has been recoded to short codes (FRA, BEL, ...) by this point, so
## lower-casing it only yields "fra", "bel", ... and not the country names.
## The names as they appear in the texts are therefore listed explicitly;
## "czech republic" and "united kingdom" are covered by the extra terms below.
country_names <- data$ms
country_full_names <- c("france", "belgium", "netherlands", "italy", "luxembourg",
                        "denmark", "ireland", "spain", "portugal", "finland",
                        "sweden", "austria", "cyprus", "estonia", "hungary",
                        "latvia", "lithuania", "malta", "poland", "slovakia",
                        "slovenia", "bulgaria", "romania", "croatia", "germany")

## Remove the short codes (as before) and the full country names
texts <- tm_map(texts, removeWords, tolower(unique(country_names)))
texts <- tm_map(texts, removeWords, country_full_names)

## Remove the codes with a trailing "s" and the multi-word country names
namess <- paste0(unique(country_names), "s")
namess <- c(namess, "czech", "republic", "republics", "united", "kingdom", "kingdoms")
texts <- tm_map(texts, removeWords, tolower(namess))
rm(namess)

#Removing other words
## The removal list is assembled from thematic groups, in the original order.
boilerplate_words <- c("european", "commission", "council", "recommendation",
                       "article", "opinion", "programme", "en", "ec",
                       "take", "measures", "foreseen", "no")
month_words <- tolower(month.name)
nationality_words <- c("austrian", "belgian", "bulgarian", "croatian", "cypriot",
                       "danish", "estonian", "finnish", "french", "german",
                       "hungarian", "irish", "italian", "latvian", "lithuanian",
                       "luxembourgish", "maltese", "dutch", "polish", "romanian",
                       "slovakian", "slovenian", "spanish", "swedish", "british")
misc_words <- c("kst", "netherland", "final", "com", "roma")
## Empty placeholder entries carried over from the original list
placeholder_words <- rep("", 14L)

otherWords <- c(boilerplate_words, month_words, nationality_words,
                misc_words, placeholder_words)
texts <- tm_map(texts, removeWords, otherWords)

#Stemming
stems <- tm_map(texts, stemDocument)

#Remove frequent/infrequent stems
Scm <- DocumentTermMatrix(stems)

## Share of documents in which each stem occurs at least once
stem_present <- as.matrix(Scm) != 0
wordFrequency <- colSums(stem_present) / nrow(stem_present)

## Inspect the distribution before choosing the cut-points
hist(wordFrequency)
summary(wordFrequency)

## Stems found in more than 60% or in fewer than 1% of the documents
frequent <- names(wordFrequency)[wordFrequency > 0.60]
infrequent <- names(wordFrequency)[wordFrequency < 0.01]
summary(infrequent)

## Number of documents with / without the stem "implement";
## drop table() to see the documents themselves
table(as.matrix(Scm[, colnames(Scm) == "implement"] > 0))

#remove infrequent and overly frequent stems
## The second step has to continue from stemsClean. Starting again from stems
## would throw away the removal of the infrequent stems.
stemsClean <- tm_map(stems, removeWords, infrequent)
stemsClean <- tm_map(stemsClean, removeWords, frequent)

#Generate the Stem Count Matrix
Scm <- DocumentTermMatrix(stemsClean)

## Convert the stem count matrix into a quanteda dfm
mytf <- as.dfm(as.matrix(Scm))

## Adding metadata to dfm: label each row with the recommendation ID
rownames(mytf) <- df$ID

## Topic model input: keep only documents with at least one remaining term,
## then convert to the format used by topicmodels
has_terms <- rowSums(mytf) > 0
tmtf <- convert(mytf[which(has_terms), ], to = "topicmodels")

## Determining number of topics using ldatuning
## Important note: because models are generative, outcomes will differ
## marginally each time. Multiple repetitions of this tool always range
## between 29-31 topics, with 30 as median outcome value.
#install.packages("ldatuning")
library(ldatuning)

tuning_metrics <- c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014")

## Coarse pass: 10 to 200 topics in steps of 10
result <- FindTopicsNumber(
  tmtf,
  topics = seq(10, 200, by = 10),
  metrics = tuning_metrics,
  method = "Gibbs",
  mc.cores = 4L,
  verbose = TRUE
)
FindTopicsNumber_plot(result)

## Fine pass: every topic count from 20 to 50
result <- FindTopicsNumber(
  tmtf,
  topics = seq(20, 50, by = 1),
  metrics = tuning_metrics,
  method = "Gibbs",
  mc.cores = 4L,
  verbose = TRUE
)
FindTopicsNumber_plot(result)

## Topic model with 30 topics. Here too outcomes will vary marginally due to generative nature of models. Also,
## topics will not be ordered the same way as for the model reported in the paper. Topics will only differ marginally,
## but topic 1 may correspond to topic 15 as reported in the paper, etc.
## The number of topics is set once; the topic columns are picked by name below,
## so changing it cannot silently score the wrong columns.
n_topics <- 30
topicmodel1 <- LDA(tmtf, method = "VEM", k = n_topics)
get_terms(topicmodel1, k = 20)
get_topics(topicmodel1)

## Per-document topic probabilities, one column per topic
postTopics <- data.frame(posterior(topicmodel1)$topics)
topic_cols <- colnames(postTopics)   # recorded before any non-topic column is added
cors <- cor(postTopics)
## Largest probability below the overall maximum
max(postTopics[postTopics != max(postTopics)])
postTopics$ID <- rownames(postTopics)

## Attach text, member state and year of each recommendation
postTopics <- merge(postTopics, df, by = "ID")

## score = highest topic probability of the document, max = index of that topic
topic_probs <- as.matrix(postTopics[, topic_cols, drop = FALSE])
postTopics[, "score"] <- apply(topic_probs, 1, max)
postTopics[, "max"] <- apply(topic_probs, 1, which.max)
#write.csv2(postTopics, file = "postTopics.csv")

table(postTopics$max)
mean(postTopics$score)

library(plyr)

## Mean of every numeric column per member state; plot topic 8
bymsTopics <- ddply(postTopics, .variables = "ms", .fun = numcolwise(mean))
barplot(bymsTopics[["X8"]], names.arg = bymsTopics[["ms"]], beside = TRUE)

## Mean of every numeric column per year; plot topic 3
byyearTopics <- ddply(postTopics, .variables = "year", .fun = numcolwise(mean))
barplot(byyearTopics[["X3"]], names.arg = byyearTopics[["year"]], beside = TRUE)